{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys, random, os, pickle\n",
    "from multiprocessing import Pool\n",
    "from sklearn.svm import SVC\n",
    "from sklearn import metrics\n",
    "from sklearn.externals import joblib\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "if not os.path.exists('model'):\n",
    "    os.mkdir('model')\n",
    "if not os.path.exists('preds'):\n",
    "    os.mkdir('preds')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_x_d_cols = pd.read_csv('./rank_d_feature_score.csv')\n",
    "train_x_d_cols = list(train_x_d_cols.iloc[10:810].feature)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "正负样本不均衡 {0.0: 31267, 1.0: 2198}\n",
      "CPU times: user 13.7 s, sys: 1.69 s, total: 15.4 s\n",
      "Wall time: 15.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "#use rank_d  and rank_nd feature\n",
    "#load data\n",
    "train_x_date = pd.read_csv('../../preprocess_data/train_x_date.csv').drop(columns=['id'])\n",
    "train_x_null = pd.read_csv('../../preprocess_data/train_x_null.csv').drop(columns=['id'])\n",
    "train_x_int = pd.read_csv('../../preprocess_data/train_x_int.csv').drop(columns=['id','tag']) # valid中无‘tag’\n",
    "train_x_d = pd.read_csv('../../preprocess_data/train_x_float_rank_d.csv',usecols=train_x_d_cols)\n",
    "train_x_nd = pd.read_csv('../../preprocess_data/train_x_float_rank_nd.csv').drop(columns=['id'])\n",
    "\n",
    "train_x = pd.concat([train_x_date,train_x_null,train_x_int,train_x_d,train_x_nd],axis=1,ignore_index=True,copy=False)\n",
    "train_y = pd.read_csv('../../preprocess_data/train_y_33465.csv')\n",
    "print('正负样本不均衡',train_y.label.value_counts().to_dict())\n",
    "\n",
    "train_x = train_x.values\n",
    "train_y = train_y.label.values\n",
    "\n",
    "\n",
    "valid_date = pd.read_csv('../../preprocess_data/valid_date.csv')\n",
    "valid_null = pd.read_csv('../../preprocess_data/valid_null.csv').drop(columns=['id'])\n",
    "valid_int = pd.read_csv('../../preprocess_data/valid_int.csv').drop(columns=['id'])\n",
    "valid_d = pd.read_csv('../../preprocess_data/valid_float_rank_d.csv',usecols=train_x_d_cols)\n",
    "valid_nd = pd.read_csv('../../preprocess_data/valid_float_rank_nd.csv').drop(columns=['id'])\n",
    "valid = pd.concat([valid_date,valid_null,valid_int,valid_d,valid_nd],axis=1,ignore_index=True,copy=False)\n",
    "# pd.concat后特征名没有了，变成数字了\n",
    "valid_id = valid.iloc[:,0].values\n",
    "valid = valid.drop(0,axis=1)\n",
    "test = valid.values\n",
    "test_id = valid_id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def pipeline(iteration,C,gamma,random_seed):\n",
    "    clf = SVC(C=C,kernel='rbf',gamma=gamma,probability=True,cache_size=8096,class_weight='balanced',verbose=True,random_state=random_seed)\n",
    "    clf.fit(train_x,train_y)\n",
    "    joblib.dump(clf, './model/svm{0}.pkl'.format(iteration))\n",
    "\n",
    "    pred = clf.predict_proba(test)\n",
    "    test_result = pd.DataFrame(columns=[\"id\",\"score\"])\n",
    "    test_result.id = test_id\n",
    "    test_result.score = pred[:,1]\n",
    "    test_result.to_csv('./preds/svm_pred{0}.csv'.format(iteration),index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "random_seed = [x for x in range(2016,2046)]\n",
    "C = [i/10.0 for i in range(10,40)]\n",
    "gamma = [i/1000.0 for i in range(1,31)]\n",
    "random.shuffle(random_seed)\n",
    "random.shuffle(C)\n",
    "random.shuffle(gamma)\n",
    "\n",
    "with open('./params.pkl','wb') as f:\n",
    "    pickle.dump((random_seed,C,gamma),f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#mp version\n",
    "pool = Pool(30)\n",
    "for i in range(30):\n",
    "    pool.apply_async(pipeline,args=(i,C[i],gamma[i],random_seed[i]))\n",
    "pool.close()\n",
    "pool.join()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
